
                                                        Read me file by Caterina Froio 
         Paper " Race, religion or culture? Framing Islam between racism and neo-racism in the online network of the French far-right"
                                                  Perspectives on Politics (POP-D-17-00028R3)
                                                Author: Caterina Froio (caterina.froio@eui.eu)

"""""" These instructions will get you running the project on your local machine for development and testing purposes. 

Software required:

 1) Hyphe: to build the web corpora. It is freely available here http://hyphe.medialab.sciences-po.fr/
 2) Gephi: to visualize and explore the network of websites. It is freely available here https://gephi.org/users/download/
 3) Python: to scrape the content of the websites. It is freely available here https://www.python.org/downloads/
 4) Scrapy: a web crawling framework written in Python. It is freely available here https://scrapy.org/download/

To visualize and explore the network statistics you do not need to write scripts, as the interfaces of Hyphe and Gephi are very user-friendly. Just use the website attached to the paper, which includes the web corpora built with Hyphe, and
ask for the SNA measures reported in the paper.
For the content analysis of the websites you need 3 scripts: A. for crawling; B. for extracting all text from the websites; C. for keyword search. Here are mine, prepared with Python 3.6.5 and Scrapy 1.5.0.
Enjoy! """"""


"""""" 
    A. CRAWLING 
    Scrapy spider for Caterina's project "Race, religion or culture?".
    Requirements : scrapy 1.5.0

    Usage : python crawl.py (or whatever name you give to this file)

    Configuration :

    Feel free to modify:
         - WORKING_FOLDER
         - conf.json

"""""" 

__author__ = "Caterina Froio"
__copyright__ = "Copyright 2018, CurrantSolutions LTD"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Caterina Froio"
__email__ = "caterina.froio@eui.eu"


import os
import json
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner

# Root folder under which downloaded pages are stored. The spider builds each
# site's output folder as WORKING_FOLDER + <spider name> (plain string
# concatenation), so this value must end with a path separator.
WORKING_FOLDER = './'


class CaterinaSpider(CrawlSpider):
    """Spider that saves the raw body of the pages it is routed to.

    One instance is created per website. Links extracted from a response are
    kept when their URL matches one of the ``allowed_domains`` entries (they
    are handed to ``LinkExtractor(allow=...)``, i.e. treated as regular
    expressions), and every matching page is passed to ``parse_item``.

    NOTE(review): the rule sets a callback but no ``follow=True``. Per the
    Scrapy documentation ``follow`` then defaults to False, so links found on
    the saved pages are presumably not followed any further - confirm that
    this crawl depth is intended.
    """

    def __init__(self, name, allowed_domains=None,
                 start_urls=None, *args, **kwargs):
        """Configure the spider for a single website.

        Args:
            name: Spider name; also used as the output folder name.
            allowed_domains: List of domain strings for this website.
                Defaults to an empty list.
            start_urls: List of URLs the crawl starts from. Defaults to an
                empty list.
            *args: Forwarded to ``CrawlSpider.__init__``.
            **kwargs: Forwarded to ``CrawlSpider.__init__``.
        """
        # Build fresh lists per instance: a mutable default argument would be
        # shared by every spider created without explicit values.
        allowed_domains = [] if allowed_domains is None else allowed_domains
        start_urls = [] if start_urls is None else start_urls

        self.name = name
        self.allowed_domains = allowed_domains
        self.start_urls = start_urls
        # The rules must exist before the parent constructor runs.
        self.rules = (
            Rule(LinkExtractor(allow=allowed_domains),
                 callback='parse_item'),
        )
        # Running index that prefixes every saved file name.
        self.cnt = 0
        super(CaterinaSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        """Rule callback: store the body of ``response`` on disk."""
        print('invoked')
        self.save_page_body(response)

    def save_page_body(self, response):
        """Write ``response.body`` to ``<WORKING_FOLDER><name>/<n>-<page>.html``.

        ``<page>`` is the second-to-last ``/``-separated component of the URL
        and ``<n>`` is this spider's running counter, which keeps file names
        unique within one spider run.
        """
        page = response.url.split("/")[-2]
        folder = WORKING_FOLDER + self.name
        # exist_ok avoids the separate existence check before creating.
        os.makedirs(folder, exist_ok=True)
        filename = '%s/%d-%s.html' % (folder, self.cnt, page)
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.cnt += 1
        self.log('Saved file %s' % filename)


# Settings applied to every crawl started below. They are passed to the same
# object that schedules the spiders; otherwise the spiders would run with
# Scrapy's default settings.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'ROBOTSTXT_OBEY': False
})

with open('conf.json') as json_data:
    conf = json.load(json_data)

# Schedule one spider per entry of the "websites" list in conf.json.
for cfg in conf['websites']:
    website_name = cfg['name']
    website_domains = cfg['domains']
    website_start_url = cfg['start_url']

    process.crawl(CaterinaSpider, name=website_name,
                  allowed_domains=website_domains,
                  start_urls=website_start_url)

# start() runs the Twisted reactor until all scheduled crawls have finished.
# It returns immediately when nothing was scheduled (empty "websites" list)
# instead of leaving the reactor running forever.
process.start()


""""""  
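    B. EXTRACTING ALL TEXT FROM THE WEBSITES 
    Websites content extractor for Caterina's project "Race, religion or culture?".
    Requirements : scrapy 1.5.0

    Usage : python crawl.py (or whatever name you give to the crawler script of section A)

    Configuration :

    The JSON below is the content of conf.json, which the crawler script reads at start-up.
    Only the entries listed under "websites" are crawled, so copy the sites you want
    to download into that list.

    Feel free to modify:
         - WORKING_FOLDER
         - conf.json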

"""""" 


{
	"downloaded" :[
		{
			"name" : "egaliteetreconciliation",
		 	"domains" : ["www.egaliteetreconciliation.fr"],
		 	"start_url" : ["https://www.egaliteetreconciliation.fr"]
		},{
			"name" : "gud-lyon",
		 	"domains" : ["gud-lyon.blogspot.fr"],
		 	"start_url" : ["http://gud-lyon.blogspot.fr"]
		},{
			"name" : "civitasinstitut",
		 	"domains" : ["www.civitas-institut.com"],
		 	"start_url" : ["http://www.civitas-institut.com/"]
	     },{
			"name" : "Liguedefensejuive",
		 	"domains" : ["www.liguedefensejuive.com"],
		 	"start_url" : ["http://www.liguedefensejuive.com/"]
		},{
			"name" : "Frontnational",
		 	"domains" : ["www.frontnational.com"],
		 	"start_url" : ["http://www.frontnational.com/"]
		},{
			"name" : "jeunenation",
		 	"domains" : ["www.jeune-nation.com"],
		 	"start_url" : ["https://www.jeune-nation.com/"]
		},{
			"name" : "Liguedumidi",
		 	"domains" : ["liguedumidi.com"],
		 	"start_url" : ["https://liguedumidi.com/"]
		}, {
			"name" : "Partiantisioniste",
		 	"domains" : ["www.partiantisioniste.com"],
		 	"start_url" : ["http://www.partiantisioniste.com"]
		},{
			"name" : "Robertfaurisson",
		 	"domains" : ["robertfaurisson.blogspot.fr"],
		 	"start_url" : ["http://robertfaurisson.blogspot.fr"]
		},{
			"name" : "Soschretiensdorient",
		 	"domains" : ["www.soschretiensdorient.fr"],
		 	"start_url" : ["http://www.soschretiensdorient.fr/"]
        },{
			"name" : "Renouveau-Francais",
		 	"domains" : ["renouveau-francais.com"],
		 	"start_url" : ["http://renouveau-francais.com/"]
        },{
			"name" : "Renaud-Camus",
		 	"domains" : ["www.renaud-camus.net"],
		 	"start_url" : ["https://www.renaud-camus.net/"]
        },{
			"name" : "Sos-Tout-Petits",
		 	"domains" : ["www.sos-tout-petits.org"],
		 	"start_url" : ["http://www.sos-tout-petits.org/"]
		 },{
			"name" : "Ichtus",
		 	"domains" : ["www.ichtus.fr/"],
		 	"start_url" : ["http://www.ichtus.fr/"]
		},{
			"name" : "Ripostelaique",
		 	"domains" : ["ripostelaique.com"],
		 	"start_url" : ["https://ripostelaique.com/"]
		},{
			"name" : "Rbmfrance",
		 	"domains" : ["www.rbleumarine.fr"],
		 	"start_url" : ["https://www.rbleumarine.fr/"]
		},{
			"name" : "Siel-Souverainete",
		 	"domains" : ["www.siel-souverainete.fr"],
		 	"start_url" : ["http://www.siel-souverainete.fr/"]
		},{
			"name" : "Englishdefenceleague",
		 	"domains" : ["www.englishdefenceleague.org.uk"],
		 	"start_url" : ["http://www.englishdefenceleague.org.uk"]
		 },{
			"name" : "Frontdespatriotes",
		 	"domains" : ["fdespatriotes.centerblog.net"],
		 	"start_url" : ["http://fdespatriotes.centerblog.net/"]
		 	
		 },{
			"name" : "udcSwiss",
		 	"domains" : ["www.udc.ch/parti/"],
		 	"start_url" : ["https://www.udc.ch/parti/"]
		 	
		 },{
			"name" : "Terreetpeuple",
		 	"domains" : ["www.terreetpeuple.com"],
		 	"start_url" : ["http://www.terreetpeuple.com/"]
		 
		 },{
			"name" : "M-N-R",
		 	"domains" : ["www.m-n-r.fr"],
		 	"start_url" : ["http://www.m-n-r.fr/"]
		 
		 },{
			"name" : "Vlaamsbelang",
		 	"domains" : ["www.vlaamsbelang.org"],
		 	"start_url" : ["https://www.vlaamsbelang.org/"]
		 
		 },{
			"name" : "Nationalite-citoyennete-identite",
		 	"domains" : ["www.nationalite-citoyennete-identite.com"],
		 	"start_url" : ["http://www.nationalite-citoyennete-identite.com/"]
		 
		 },{
			"name" : "Zentropa.Info",
		 	"domains" : ["zentropa.info"],
		 	"start_url" : ["http://zentropa.info/"]
		 
		 },{
			"name" : "Resistancerepublicaine",
		 	"domains" : ["resistancerepublicaine.eu"],
		 	"start_url" : ["http://resistancerepublicaine.eu/"]
		 
		 },{
			"name" : "Riposte-catholique",
		 	"domains" : ["www.riposte-catholique.fr"],
		 	"start_url" : ["http://www.riposte-catholique.fr/"]
		 
		 },{
			"name" : "Secours-catholique",
		 	"domains" : ["www.secours-catholique.org"],
		 	"start_url" : ["http://www.secours-catholique.org/"]
		 
		 },{
			"name" : "Renaissancecatholique",
		 	"domains" : ["www.renaissancecatholique.org"],
		 	"start_url" : ["http://www.renaissancecatholique.org/"]
		 
		 },{
			"name" : "Upr",
		 	"domains" : ["www.u-p-r.fr"],
		 	"start_url" : ["http://www.u-p-r.fr"]
		 
		 },{
			"name" : "Vigilancehallal",
		 	"domains" : ["vigilancehallal.com"],
		 	"start_url" : ["http://vigilancehallal.com"]
		 
		 },{
			"name" : "Collectifmarianne",
		 	"domains" : ["www.collectifmarianne.fr"],
		 	"start_url" : ["http://www.collectifmarianne.fr/"]
		 
		 },{
			"name" : "Actionroyaliste",
		 	"domains" : ["www.actionroyaliste.fr"],
		 	"start_url" : ["https://www.actionroyaliste.fr/"]
		 
		 },{
			"name" : "Christianophobie",
		 	"domains" : ["www.christianophobie.fr"],
		 	"start_url" : ["http://www.christianophobie.fr"]
		 
		 },{
			"name" : "Clubdelhorloge",
		 	"domains" : ["www.clubdelhorloge.fr"],
		 	"start_url" : ["http://www.clubdelhorloge.fr/"]
		 
		 },{
			"name" : "Dominiquevenner",
		 	"domains" : ["www.dominiquevenner.fr"],
		 	"start_url" : ["https://www.dominiquevenner.fr/"]
		
		},{
			"name" : "Alaindebenoist",
		 	"domains" : ["www.alaindebenoist.com"],
		 	"start_url" : ["https://www.alaindebenoist.com/"]
		 
		 },{
			"name" : "Catholique etudiants toulouse",
		 	"domains" : ["etudiants-toulouse.catholique.fr"],
		 	"start_url" : ["http://etudiants-toulouse.catholique.fr/"]
		 
		 },{
			"name" : "Belle-et-rebelle",
		 	"domains" : ["www.blocidentitaire-idf.com"],
		 	"start_url" : ["http://www.blocidentitaire-idf.com/bloc-identitaire-idf/belle-et-rebelle/"]
		 
		 },{
			"name" : "Liguefrancilienne",
		 	"domains" : ["liguefrancilienne.com"],
		 	"start_url" : ["https://liguefrancilienne.com/"]
		},{
			"name" : "Meridienzero",
		 	"domains" : ["radiomz.org"],
		 	"start_url" : ["https://radiomz.org/"]
		},{
			"name" : "Charentelibre",
		 	"domains" : ["charentelibre.fr"],
		 	"start_url" : ["http://www.charentelibre.fr/"]
		
		},{
			"name" : "Laportelatine",
		 	"domains" : ["laportelatine.org"],
		 	"start_url" : ["http://laportelatine.org"]

		},{
			"name" : "Parti-de-la-france",
		 	"domains" : ["parti-de-la-france.fr"],
		 	"start_url" : ["http://www.parti-de-la-france.fr/"]
		},{
			"name" : "Catho-bruxelles",
		 	"domains" : ["catho-bruxelles.be"],
		 	"start_url" : ["http://www.catho-bruxelles.be/"]
		},{
			"name" : "Bloc-Identitaire",
		 	"domains" : ["les-identitaires.com"],
		 	"start_url" : ["http://www.les-identitaires.com/"]
		
		},{
			"name" : "France-catholique",
		 	"domains" : ["france-catholique.fr"],
		 	"start_url" : ["https://www.france-catholique.fr/"]
		},{
			"name" : "Jeunes-cathos",
		 	"domains" : ["blog.jeunes-cathos.fr"],
		 	"start_url" : ["http://blog.jeunes-cathos.fr/"]
		},{
			"name" : "entraide-solidarite",
		 	"domains" : ["entraide-solidarite.com"],
		 	"start_url" : ["https://www.entraide-solidarite.com/"]
		},{
			"name" : "Europemaxima",
		 	"domains" : ["europemaxima.com"],
		 	"start_url" : ["http://www.europemaxima.com/"]
		},{
			"name" : "Nissarebela",
		 	"domains" : ["liberanissa.eu"],
		 	"start_url" : ["http://www.liberanissa.eu"]
		},{
			"name" : "Jeunesactifs-patriotes",
		 	"domains" : ["jeunesactifs-patriotes.fr"],
		 	"start_url" : ["http://jeunesactifs-patriotes.fr/"]
		},{
			"name" : "Jourdecolere",
		 	"domains" : ["jourdecolere.com"],
		 	"start_url" : ["http://www.jourdecolere.com/"]
		},{
			"name" : "laissezlesvivre",
		 	"domains" : ["laissezlesvivre.free.fr"],
		 	"start_url" : ["http://laissezlesvivre.free.fr/"]
		},{
			"name" : "lespatriotes",
		 	"domains" : ["lespatriotes.net"],
		 	"start_url" : ["https://lespatriotes.net"]
        },{
			"name" : "Antigones",
		 	"domains" : ["antigones.fr"],
		 	"start_url" : ["https://antigones.fr/"]
        },{
			"name" : "islamisation",
		 	"domains" : ["islamisation.fr/"],
		 	"start_url" : ["http://islamisation.fr/"]
        },{
			"name" : "Gollnish",
		 	"domains" : ["gollnisch.com"],
		 	"start_url" : ["http://www.gollnisch.com/"]
        },{
			"name" : "Bloccostudentesco",
		 	"domains" : ["bloccostudentesco.org"],
		 	"start_url" : ["http://www.bloccostudentesco.org/"]
        },{
			"name" : "caryatides",
		 	"domains" : ["caryatides.fr"],
		 	"start_url" : ["http://caryatides.fr"]
        },{
			"name" : "casapounditalia",
		 	"domains" : ["casapounditalia.org"],
		 	"start_url" : ["http://www.casapounditalia.org/"]
        },{
			"name" : "Jeanmarielepen",
		 	"domains" : ["jeanmarielepen.com"],
		 	"start_url" : ["http://www.jeanmarielepen.com/"]
        },{
			"name" : "academiecatholiquedefrance",
		 	"domains" : ["academiecatholiquedefrance.fr"],
		 	"start_url" : ["http://www.academiecatholiquedefrance.fr/"]
        },{
			"name" : "Manifpourtous",
		 	"domains" : ["nord.manifpourtous.com"],
		 	"start_url" : ["http://nord.manifpourtous.com/"]
        },{
			"name" : "Parti-nationaliste-francais",
		 	"domains" : ["parti-nationaliste-francais.com"],
		 	"start_url" : ["http://parti-nationaliste-francais.com/"]
        },{
			"name" : "Parti-nationaliste-francais",
		 	"domains" : ["parti-nationaliste-francais.com"],
		 	"start_url" : ["http://parti-nationaliste-francais.com/"]
        },{
			"name" : "Reconquetefrancaise",
		 	"domains" : ["deus-vult.org"],
		 	"start_url" : ["https://deus-vult.org"]
        },{
			"name" : "dieudosphere",
		 	"domains" : ["dieudosphere.tumblr.com"],
		 	"start_url" : ["http://dieudosphere.tumblr.com/"]
        },{
			"name" : "lamanifpourtoustoulouse",
		 	"domains" : ["lamanifpourtoustoulouse.blogspot.fr"],
		 	"start_url" : ["http://lamanifpourtoustoulouse.blogspot.fr/"]
        },{
			"name" : "jeunessesnationalistes",
		 	"domains" : ["lorrainenationaliste.com"],
		 	"start_url" : ["https://lorrainenationaliste.com"]
        },{
			"name" : "reseauidentites",
		 	"domains" : ["liguefrancilienne.com"],
		 	"start_url" : ["https://liguefrancilienne.com/"]
        },{
			"name" : "Defrancisation",
		 	"domains" : ["islamisme.fr"],
		 	"start_url" : ["http://www.islamisme.fr/islamisation/www-defrancisation-com/"]
        }


		

	],

	"websites" : [

	 ]
}


"""""" 
    C. FOR KEYWORDS SEARCH
    Search tool for Caterina's project.

    Usage : python searcher.py path query

    - path  : (root directory where you saved your files)
    - query : expression to search (treated as a case-insensitive regular expression)
    
     examples:
       python searcher.py .  'pour rappel imam'
       python searcher.py .  Muslim

    Keywords used: Islam, Muslim, mosque, imam, Qur'an (Quran, Qur'an, Koran, Alcoran or Al-Qur'an), headscarf, burqa (burkha,
     burka or burqua), minaret.

"""""" 


__author__ = "Caterina Froio"
__copyright__ = "Copyright 2018, CurrantSolutions LTD"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Caterina Froio"
__email__ = "caterina.froio@eui.eu"


import os
import re
import sys
import collections
import string
import cchardet as chardet

# Translation table for str.translate(): ASCII letters are lower-cased,
# digits are kept, the listed Unicode space characters become a plain space,
# and every other character maps to None (i.e. is deleted) through the
# defaultdict factory. Nothing in this file currently applies the table.
table = collections.defaultdict(lambda: None)
table.update({
    # NOTE(review): these two keys were unreadable (mis-encoded) in the
    # source, so both collapsed onto the same replacement character. They are
    # assumed to have been e-acute and o-circumflex - confirm against the
    # original script. Named escapes keep them safe from re-encoding.
    ord('\N{LATIN SMALL LETTER E WITH ACUTE}'): 'e',
    ord('\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'): 'o',
    ord(' '): ' ',
    ord('\N{NO-BREAK SPACE}'): ' ',
    ord('\N{EN SPACE}'): ' ',
    ord('\N{EM SPACE}'): ' ',
    ord('\N{THREE-PER-EM SPACE}'): ' ',
    ord('\N{FOUR-PER-EM SPACE}'): ' ',
    ord('\N{SIX-PER-EM SPACE}'): ' ',
    ord('\N{FIGURE SPACE}'): ' ',
    ord('\N{PUNCTUATION SPACE}'): ' ',
    ord('\N{THIN SPACE}'): ' ',
    ord('\N{HAIR SPACE}'): ' ',
    ord('\N{ZERO WIDTH SPACE}'): ' ',
    ord('\N{NARROW NO-BREAK SPACE}'): ' ',
    ord('\N{MEDIUM MATHEMATICAL SPACE}'): ' ',
    ord('\N{IDEOGRAPHIC SPACE}'): ' ',
    ord('\N{IDEOGRAPHIC HALF FILL SPACE}'): ' ',
    ord('\N{ZERO WIDTH NO-BREAK SPACE}'): ' ',
    ord('\N{TAG SPACE}'): ' '})

# Upper-case ASCII letters fold to lower case; lower-case letters and digits
# map to themselves so that they survive the delete-by-default rule.
table.update({ord(upper): upper.lower()
              for upper in string.ascii_uppercase})
table.update({ord(lower): lower for lower in string.ascii_lowercase})
table.update({ord(digit): digit for digit in string.digits})


class Searcher(object):
    """Count the matches of a query in every ``.html`` file under a folder.

    The query is compiled as a regular expression and matched
    case-insensitively. Results accumulate on the instance across calls to
    ``find``.
    """

    def __init__(self, path, query):
        """Store the search root and the query; no file is read yet.

        Args:
            path: Directory that ``find`` walks by default.
            query: Regular expression to look for.
        """
        self.path = path
        self.query = query
        # Maps file path -> match count, only for files with at least one hit.
        self.searched = {}
        # Sum of the match counts over all files searched so far.
        self.total = 0

    def find(self, path=None):
        """Walk ``path`` recursively and count the query in each HTML file.

        Each file is read once as bytes to detect its encoding with
        ``chardet.detect`` and then read again as text with that encoding.
        Files that cannot be read or decoded are reported on stdout and
        skipped.

        Args:
            path: Directory to walk; defaults to the path given at
                construction time.

        Raises:
            re.error: If the query is not a valid regular expression.
        """
        if path is None:
            path = self.path
        # Compile once up front: an invalid query fails immediately instead
        # of being reported as an encoding problem for every single file.
        pattern = re.compile(self.query, re.IGNORECASE)
        for root, _dirs, files in os.walk(path):
            for filename in files:
                if not filename.endswith('.html'):
                    continue
                file_path = os.path.join(root, filename)
                try:
                    with open(file_path, 'rb') as raw:
                        enc = chardet.detect(raw.read())['encoding']
                    with open(file_path, 'rt', encoding=enc) as f:
                        txt = f.read()
                except OSError:
                    print('unreadable file : ' + file_path)
                    continue
                except (UnicodeError, LookupError):
                    # Wrong or unknown encoding reported by the detector.
                    print('invalid encoding : ' + file_path)
                    continue

                count = len(pattern.findall(txt))
                if count > 0:
                    self.searched[file_path] = count
                    self.total = self.total + count

    def getResults(self):
        """Return the dict mapping each matching file path to its count."""
        return self.searched

    def getTotalOccurences(self):
        """Return the total number of matches found so far."""
        return self.total


def resolve_search_root(raw_path):
    """Return the directory to search for a command-line path argument.

    A directory is returned as its resolved absolute path, so that
    ``searcher.py .`` searches the current directory rather than its parent.
    Any other path (a file, or a path that does not exist) yields its
    containing directory.
    """
    resolved = os.path.realpath(raw_path)
    if os.path.isdir(resolved):
        return resolved
    return os.path.dirname(resolved)


if __name__ == '__main__':

    if len(sys.argv) != 3:
        print("Usage : %s <path> <keyword>" % (sys.argv[0]))
    else:
        path = resolve_search_root(sys.argv[1])
        query = sys.argv[2]
        search = Searcher(path, query)
        search.find()
        results = search.getResults()

        # One line per matching file, then the overall summary.
        for file, count in results.items():
            print('File: [%s], Found entries: %d' % (file, count))
        print (' ----> Found %d files  - %d occurences' %
               (len(results), search.getTotalOccurences()))


